Background and objectives

Visualize the empirical distribution function of count in each individual. We are interested to learn distribution of UMI counts without log transformation. When we visualize density distributions on a log scale, there appear to be multiple modes in the distribution. The mode with the lowest expression value corresponds to cells with zero UMI count.

For simplicity, here I selected a single replicate from each individaul cell line and compare count distributions across individuals.

Thoughts

  1. We didn’t observe bimodality of gene expression of the 16 pluripotent genes in our data. This could be due to the fact that our iPSC cell lines are controlled in an undifferentiated and homogeneous state. In the case when differentiation is expected in the cells, we should see a pattern of bimodality in expression distribution.

  2. Density plot is not a useful tool in visuazling count distribution. They create a visual illusion of multiple modes in the distribution when there are big increase or decrease in frequency of the counts.

  3. Replicates are consisent in count distributions in each individual cell line.

Setting up

library(Humanzee)

Import list of pluripotent genes.

pluripotent_genes <- read.table("../data/gilad-2015/pluripotency-genes.txt",
                                header = TRUE,
                                stringsAsFactors = FALSE, sep = "\t")
head(pluripotent_genes)
##     From              To      Species
## 1   GDF3 ENSG00000184344 Homo sapiens
## 2  ZFP42 ENSG00000179059 Homo sapiens
## 3  NR6A1 ENSG00000148200 Homo sapiens
## 4   SOX2 ENSG00000181449 Homo sapiens
## 5 CDKN2A ENSG00000147889 Homo sapiens
## 6 POU5F1 ENSG00000204531 Homo sapiens
##                                                             Gene.Name
## 1                                     growth differentiation factor 3
## 2                              zinc finger protein 42 homolog (mouse)
## 3                     nuclear receptor subfamily 6, group A, member 1
## 4                                SRY (sex determining region Y)-box 2
## 5 cyclin-dependent kinase inhibitor 2A (melanoma, p16, inhibits CDK4)
## 6                                              POU class 5 homeobox 1

Import filtered molecule counts

molecules_filter <- read.table("../data/gilad-2015/molecules-filter.txt",
                               header = TRUE,
                               stringsAsFactors = FALSE)
which_pluripotent <- which(rownames(molecules_filter) %in% pluripotent_genes$To)
molecules_pluripotent <- molecules_filter[which_pluripotent, ]

Subset pluripotent genes that are mapped in our data.

pluripotent_subset <- pluripotent_genes[
    which(pluripotent_genes$To %in% rownames(molecules_filter)), ]

Import annotations of the filtered data

anno_filter <- read.table("../data/gilad-2015/annotation-filter.txt",
                               header = TRUE,
                               stringsAsFactors = FALSE)
dim(anno_filter)
## [1] 560   5

Plots of replicate 3

Plot for a single replicate and all three individuals.

anno_filter_r3 <- anno_filter[ which(anno_filter$replicate == "r3"), ]
gene_counts_r3 <- molecules_pluripotent[ , anno_filter_r3$replicate == "r3"]
ngenes <- dim(gene_counts_r3)[1]

        
par(mfrow = c(1,3))
for (which_gene_view in c(1:ngenes)) {
    cols <- c("red", "green", "blue")
    individuals <- unique(anno_filter$individual)
    
    ## ECDF of all three individuals
    plot( ecdf( unlist(
        gene_counts_r3[ which_gene_view, 
                              anno_filter_r3$individual == individuals[1] ] ) ),
        main = "", col = cols[1] )
    for (ii_individual in c(2:length(individuals))) {
    lines( ecdf( unlist( gene_counts_r3[ which_gene_view, 
              anno_filter_r3$individual == individuals[ii_individual] ]) ), 
              col = cols[ii_individual] ) 
    }
    
    ## Density plot of all three individuals
    plot_density_overlay(gene_counts_r3,
                 annotation = anno_filter_r3,
                 which_gene = rownames(gene_counts_r3)[which_gene_view],
                 gene_symbols = NULL,
                 labels = "")
    
    ## Simple bar plot of counts
    count_lims <- table( unlist( gene_counts_r3[ which_gene_view, ] ) )
    ylims <- c(0, max(count_lims))
    xlims <- range(as.numeric(names(count_lims)))
    plot( table( unlist(
        gene_counts_r3[ which_gene_view, 
                              anno_filter_r3$individual == individuals[1] ] ) ),
        type = "h", xlab = "Count", ylab = "Frequency", 
        main = "", col = alpha( cols[1], .5),
        xlim = xlims, ylim = ylims, axes = F )
    axis(1); axis(2)
    for (ii_individual in c(2:length(individuals))) {
    points( table( unlist(
            gene_counts_r3[ which_gene_view, 
                              anno_filter_r3$individual == individuals[ii_individual] ] ) ),
            type = "h",
            col = alpha( cols[ii_individual], .5) )
    }

}
## Warning: package 'broman' was built under R version 3.2.3

Plots of all replicates

Plot for a single replicate and all three individuals.

anno_filter_r3 <- anno_filter[ which(anno_filter$replicate == "r3"), ]
gene_counts_r3 <- molecules_pluripotent[ , anno_filter_r3$replicate == "r3"]
ngenes <- dim(gene_counts_r3)[1]

        
par(mfrow = c(3,3))
for (which_gene_view in c(1:ngenes)) {
    cols <- c("red", "green", "blue")
    individuals <- unique(anno_filter$individual)
    replicates <- unique(anno_filter$replicate)[order(unique(anno_filter$replicate))]

    ## Simple bar plot of counts
    count_lims <- lapply(1:length(individuals), function(which_individual) {
        ind_lims <- lapply(1:length(replicates), function(which_replicate) {
            replicate_lims <- unlist( table( as.matrix(
                molecules_pluripotent[ which_gene_view, 
                        anno_filter$individual == individuals[which_individual] &
                        anno_filter$replicate == replicates[which_replicate] ] ) ) )
            if (length(replicate_lims) != 0) {
                list(ylims = max(replicate_lims),
                     xlims = range(as.numeric(names(replicate_lims)) ) )
            } else {
                list(ylims = 0,
                     xlims = 0)        
            }
        })
        list(ylims = sapply(ind_lims, "[[", 1),
             xlims = sapply(ind_lims, "[[", 2))
    }) 
    
    ylims <- c(0, max( unlist(sapply(count_lims, "[[", 1)) ) )
    xlims <- range(  unlist(sapply(count_lims, "[[", 2)) )
    
    for (ii_individual in 1:length(individuals)) {
    
    df_ind <- molecules_pluripotent[ , anno_filter$individual == individuals[ii_individual]]
    anno_ind <- anno_filter[anno_filter$individual == individuals[ii_individual], ]
    for (ii_replicate in 1:length(replicates)) {
    if (sum(anno_ind$replicate == replicates[ii_replicate]) != 0) {
    plot( table( unlist(
        df_ind[ which_gene_view, anno_ind$replicate == replicates[ii_replicate]] ) ),
        type = "h", xlab = "Count", ylab = "Frequency", 
        main = "", col = alpha( cols[ii_individual], 1),
        xlim = xlims, ylim = ylims)
    } else {
    plot(0, pch = "", axes = F, xlab = "", ylab = "")
    }
    title(paste(individuals[ii_individual],
                replicates[ii_replicate], sep = "."))
    }
    }
title(rownames(molecules_pluripotent)[which_gene_view], 
      outer = TRUE, 
      line = -1)
}

Gene info

library(knitr)
## Warning: package 'knitr' was built under R version 3.2.3
kable(pluripotent_subset)
From To Species Gene.Name
1 GDF3 ENSG00000184344 Homo sapiens growth differentiation factor 3
2 ZFP42 ENSG00000179059 Homo sapiens zinc finger protein 42 homolog (mouse)
3 NR6A1 ENSG00000148200 Homo sapiens nuclear receptor subfamily 6, group A, member 1
4 SOX2 ENSG00000181449 Homo sapiens SRY (sex determining region Y)-box 2
6 POU5F1 ENSG00000204531 Homo sapiens POU class 5 homeobox 1
10 MYC ENSG00000136997 Homo sapiens v-myc myelocytomatosis viral oncogene homolog (avian)
11 DNMT3B ENSG00000088305 Homo sapiens DNA (cytosine-5-)-methyltransferase 3 beta
12 TERT ENSG00000164362 Homo sapiens telomerase reverse transcriptase
14 FOXD3 ENSG00000187140 Homo sapiens forkhead box D3
16 DPPA5 ENSG00000203909 Homo sapiens developmental pluripotency associated 5
17 NANOG ENSG00000111704 Homo sapiens Nanog homeobox pseudogene 8; Nanog homeobox
19 PODXL ENSG00000128567 Homo sapiens podocalyxin-like
21 DPPA4 ENSG00000121570 Homo sapiens developmental pluripotency associated 4
23 DPPA2 ENSG00000163530 Homo sapiens developmental pluripotency associated 2
24 CDKN1A ENSG00000124762 Homo sapiens cyclin-dependent kinase inhibitor 1A (p21, Cip1)
25 SALL4 ENSG00000101115 Homo sapiens sal-like 4 (Drosophila)

Session information

sessionInfo()
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] knitr_1.12.3   broman_0.62-1  scales_0.3.0   Humanzee_0.1.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.3      assertthat_0.1   digest_0.6.9     plyr_1.8.3      
##  [5] formatR_1.2.1    magrittr_1.5     evaluate_0.8     highr_0.5.1     
##  [9] stringi_1.0-1    rmarkdown_0.9.2  tools_3.2.1      stringr_1.0.0   
## [13] munsell_0.4.3    yaml_2.1.13      colorspace_1.2-6 htmltools_0.3